[My Github link] https://github.com/sydneydlu98/DSI_Data_Challenge_5

Loading/Cleaning Data and Exploratory Analysis

In this Data Challenge, we will be clustering foods from the nndb_flat dataset provided on Canvas. To load/clean the data as well as perform some exploratory analysis:

  1. Read in the data
## load all the packages
library(readr)
library(dplyr)
library(GGally)
library(tidyverse)
library(plotly)
library(ggplot2)

## read in data
data <- read_csv("nndb_flat.csv")
  1. We will be dealing with only data that falls under the food groups of Vegetables and Vegetable Products, Beef Products, and Sweets. Filter the data to contain only these food groups.
## filter the data to only contain food groups of Vegetables and Vegetable Products, Beef Products, and Sweets
object <-
  c("Sweets", "Beef Products", "Vegetables and Vegetable Products")

clean_data <- data %>%
  subset(FoodGroup %in% object)
  1. Select only the variables from Energy_kcal to Zinc_mg
var <- clean_data %>%
  select(Energy_kcal:Zinc_mg)
  1. Examine the correlation among the variables using GGally::ggcorr. Which variables have a high correlation?
GGally::ggcorr(
  var,
  size = 3.2,
  label = TRUE,
  label_size = 2.7,
  hjust = .9,
  layout.exp = 2
) 

If the coefficient value lies between ± 0.50 and ± 1, then it is said to be a strong correlation. If the value is near ± 1, then it said to be a perfect correlation: as one variable increases, the other variable tends to also increase (if positive) or decrease (if negative).

We can see from the plot, we do not have any high negative correlation, but we do have many high positive correlation. Such as the correlation coefficient between Folate_mcg and Thiamin_mg is 1, which is perfect positive correlation, and others like Protein_g and Zinc_mg has a correlation coefficient of 0.9 which is considered high correlation.

Performing PCA

Steps for performing the PCA on the data:

  1. Perform PCA on the data. Don’t forget to scale the data (if it is appropriate for this application)!
data_scaled <- scale(var)

pca_data <- prcomp(data_scaled, 
                   center = FALSE, 
                   scale. = FALSE)

summary(pca_data)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     2.3024 1.8455 1.6577 1.6205 1.41770 1.03167 0.9922
## Proportion of Variance 0.2305 0.1481 0.1195 0.1142 0.08739 0.04628 0.0428
## Cumulative Proportion  0.2305 0.3785 0.4980 0.6122 0.69960 0.74587 0.7887
##                            PC8     PC9   PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.90473 0.85456 0.8279 0.74550 0.71398 0.61241 0.58307
## Proportion of Variance 0.03559 0.03175 0.0298 0.02416 0.02216 0.01631 0.01478
## Cumulative Proportion  0.82426 0.85602 0.8858 0.90998 0.93214 0.94845 0.96323
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.46385 0.42317 0.38016 0.31599 0.27229 0.23596 0.22717
## Proportion of Variance 0.00935 0.00779 0.00628 0.00434 0.00322 0.00242 0.00224
## Cumulative Proportion  0.97258 0.98037 0.98665 0.99099 0.99422 0.99664 0.99888
##                          PC22    PC23
## Standard deviation     0.1440 0.07045
## Proportion of Variance 0.0009 0.00022
## Cumulative Proportion  0.9998 1.00000
  1. Makat1e a plot showing the cumulative proportion of the variation explained by each PC with cumulative variation explained on the y-axis and PC on the x-axis.
var_explainded <- summary(pca_data)$importance[2, ]
cumulative <- cumsum(var_explainded)

var_explained_df <- data.frame(PC = 1:23,
                               var_explainded = var_explainded,
                               cum_var_explained = cumulative)

var_explained_df
var_explained_df %>%
  ggplot(aes(x = PC, y = cum_var_explained, group = 1)) +
  geom_point() +
  geom_line(lwd = 1) +
  labs(title = "Scree Plot: PCA",
       y = 'Cumulative Variation Explained',
       x = 'PC') +
  ggtitle("Plot of cumulative proportion of the variation explained by each PC") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  # add line for better visualization
  geom_vline(xintercept = 3,
             linetype = 2,
             lwd = 1,
             col = "red") 

  1. We will look at the first 3 PCs which explain about 60% of the variation in the data. Note that you may want to look at more depending on what your application is. Make 3 separate plots for the loadings for the first 3 PCs for all of the variables, ordered by the absolute value of the magnitude of the loadings.
pca_loadings <- as.data.frame(pca_data$rotation) %>%
  dplyr::select(PC1, PC2, PC3) %>%
  mutate(variable = rownames(pca_data$rotation))


pc1 <- ggplot(pca_loadings, aes(x = variable,
                                y = abs(PC1))) +
  geom_bar(stat = 'identity', fill = "#FF6666") +
  theme(axis.text.x = element_text(
    angle = 50,
    hjust = 1,
    size = 13
  )) +
  ggtitle("Plot for the loadings of PC1 for all of the variables") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  labs(y = "Loadings", x = "Variables")

pc1

pc2 <- ggplot(pca_loadings, aes(x = variable,
                                y = abs(PC2))) +
  geom_bar(stat = 'identity', fill = "darkgreen") +
  theme(axis.text.x = element_text(
    angle = 50,
    hjust = 1,
    size = 13
  )) +
  ggtitle("Plot for the loadings of PC2 for all of the variables") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  labs(y = "Loadings", x = "Variables")

pc2

pc3 <- ggplot(pca_loadings, aes(x = variable,
                                y = abs(PC3))) +
  geom_bar(stat = 'identity', fill = "blue") +
  theme(axis.text.x = element_text(
    angle = 50,
    hjust = 1,
    size = 13
  )) +
  ggtitle("Plot for the loadings of PC3 for all of the variables") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  labs(y = "Loadings", x = "Variables")

pc3

  1. Make 3 plots of the scores on the PCs colored by food group. Plot the below scores. Make the plots interactive with plotly so you can identify the food description of any outliers.
pca_scores <- as.data.frame(pca_data$x)

pca_scores <- pca_scores %>%
  mutate(FoodGroup = clean_data$FoodGroup) %>%
  mutate(description = clean_data$ShortDescrip)

pca_scores
    1. PC1 versus PC2
plot1 <- ggplot(pca_scores,
                aes(
                  x = PC1,
                  y = PC2,
                  col = FoodGroup,
                  label = description
                )) +
  geom_point() +
  ggtitle("PC1 versus PC2") +
  theme(plot.title = element_text(hjust = 0.5, size = 16))
  
ggplotly(plot1)
    1. PC1 versus PC3
plot2 <- ggplot(pca_scores, aes(x = PC1,
                                y = PC3,
                                col = FoodGroup,
                                label = description)) +
  geom_point() +
  ggtitle("PC1 versus PC3") +
  theme(plot.title = element_text(hjust = 0.5, size = 16))

ggplotly(plot2)
    1. PC2 versus PC3
plot3 <- ggplot(pca_scores, aes(x = PC2, 
                       y = PC3, 
                       col = FoodGroup,
                       label = description)) +
  geom_point() +
  ggtitle("PC2 versus PC3") +
  theme(plot.title = element_text(hjust = 0.5, size = 16))

ggplotly(plot3)

Identify Outlier and Performing PCA Again

  1. There is a major outlier on the plots above – which food is the outlier? Remove the outlier from your data.

The major outlier on the plots above is yeast extract spread from the food group vegetable and vegatable products.

Then we remove this outlier.

## remove the outlier I identified above
complete_data <- clean_data %>%
  filter(ShortDescrip != "YEAST EXTRACT SPREAD")
  1. Perform PCA again on the dataset without the outlier (steps 1-4 in the Performing PCA section above) and look at the loadings of the first 3 PCs. Have these changed? Investigate and comment on what could have caused any changes.
var_new <- complete_data %>%
  select(Energy_kcal:Zinc_mg)

data_scaled_new <- scale(var_new)

pca_data_new <- prcomp(data_scaled_new,
                       center = FALSE,
                       scale. = FALSE)

summary(pca_data_new)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.4047 1.9174 1.6360 1.5452 1.05222 1.02786 0.92913
## Proportion of Variance 0.2514 0.1598 0.1164 0.1038 0.04814 0.04593 0.03753
## Cumulative Proportion  0.2514 0.4113 0.5276 0.6314 0.67959 0.72552 0.76306
##                            PC8    PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.91925 0.8348 0.78281 0.76116 0.72114 0.70611 0.59976
## Proportion of Variance 0.03674 0.0303 0.02664 0.02519 0.02261 0.02168 0.01564
## Cumulative Proportion  0.79980 0.8301 0.85674 0.88193 0.90454 0.92621 0.94185
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.58556 0.51769 0.43258 0.42121 0.35832 0.31552 0.27864
## Proportion of Variance 0.01491 0.01165 0.00814 0.00771 0.00558 0.00433 0.00338
## Cumulative Proportion  0.95676 0.96841 0.97655 0.98426 0.98985 0.99417 0.99755
##                           PC22    PC23
## Standard deviation     0.22669 0.07037
## Proportion of Variance 0.00223 0.00022
## Cumulative Proportion  0.99978 1.00000
var_explainded_new <- summary(pca_data_new)$importance[2,]
cumulative_new <- cumsum(var_explainded_new)

var_explained_df_new <- data.frame(PC = 1:23,
                                   var_explainded = var_explainded_new, 
                                   cum_var_explained = cumulative_new)

var_explained_df_new
var_explained_df_new %>%
  ggplot(aes(x = PC, y = cum_var_explained, group = 1)) +
  geom_point() +
  geom_line(lwd = 1) +
  labs(title = "Scree Plot: PCA",
       y = 'Cumulative Variation Explained',
       x = 'PC') +
  ggtitle("Plot of cumulative proportion of the variation explained by each PC \n (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  # add line for better visualization
  geom_vline(
    xintercept = 3,
    linetype = 2,
    lwd = 1,
    col = "red"
  )

pca_loadings_new <- as.data.frame(pca_data_new$rotation) %>%
  dplyr::select(PC1, PC2, PC3) %>%
  mutate(variable = rownames(pca_data_new$rotation))


pc1_new <- ggplot(pca_loadings_new, aes(x = variable,
                                y = abs(PC1))) +
  geom_bar(stat = 'identity', fill = "#FF6666") +
  theme(axis.text.x = element_text(
    angle = 50,
    hjust = 1,
    size = 13
  )) +
  ggtitle("Plot for the loadings of PC1 for all of the variables \n (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  labs(y = "Loadings", x = "Variables")

pc1_new

pc2_new <- ggplot(pca_loadings_new, aes(x = variable,
                                y = abs(PC2))) +
  geom_bar(stat = 'identity', fill = "darkgreen") +
  theme(axis.text.x = element_text(
    angle = 50,
    hjust = 1,
    size = 13
  )) +
  ggtitle("Plot for the loadings of PC2 for all of the variables \n (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  labs(y = "Loadings", x = "Variables")

pc2_new

pc3_new <- ggplot(pca_loadings_new, aes(x = variable,
                                y = abs(PC3))) +
  geom_bar(stat = 'identity', fill = "blue") +
  theme(axis.text.x = element_text(
    angle = 50,
    hjust = 1,
    size = 13
  )) +
  ggtitle("Plot for the loadings of PC3 for all of the variables \n (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16)) +
  labs(y = "Loadings", x = "Variables")

pc3_new

pca_scores_new <- as.data.frame(pca_data_new$x)

pca_scores_new <- pca_scores_new %>%
  mutate(FoodGroup = complete_data$FoodGroup) %>%
  mutate(description = complete_data$ShortDescrip)

pca_scores_new
plot1_new <- ggplot(pca_scores_new,
                    aes(
                      x = PC1,
                      y = PC2,
                      col = FoodGroup,
                      label = description
                    )) +
  geom_point() +
  ggtitle("PC1 versus PC2 (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16))

ggplotly(plot1_new)
plot2_new <- ggplot(pca_scores_new,
                    aes(
                      x = PC1,
                      y = PC3,
                      col = FoodGroup,
                      label = description
                    )) +
  geom_point() +
  ggtitle("PC1 versus PC3 (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16))

ggplotly(plot2_new)
plot3_new <- ggplot(pca_scores_new,
                    aes(
                      x = PC2,
                      y = PC3,
                      col = FoodGroup,
                      label = description
                    )) +
  geom_point() +
  ggtitle("PC2 versus PC3 (without outlier)") +
  theme(plot.title = element_text(hjust = 0.5, size = 16))

ggplotly(plot3_new)
  1. Describe what you see in the plots of the scores and interpret this in conjunction with the loadings that you observed for the PCs.